Starting form the filtered table from ‘HMP_coverage.Rmd’. Run a series of analysis to look at relationships between body site and subjects.
print(date())
## [1] "Fri Sep 4 14:12:59 2015"
library(reshape2)
#library(igraph)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#library(biomod2)
library(e1071)
library(RColorBrewer)
library(gdata)
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
##
## The following objects are masked from 'package:dplyr':
##
## combine, first, last
##
## The following object is masked from 'package:stats':
##
## nobs
##
## The following object is masked from 'package:utils':
##
## object.size
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.3-0
library(assertthat)
source('./staph_metagenome_tools.R', echo=TRUE)
##
## > bintr <- function(mat, cutoff) {
## + mat[which(mat > cutoff)] <- 1
## + mat[which(!(mat > cutoff))] <- 0
## + return(mat)
## + }
##
## > calc_FTS <- function(pop, mini) {
## + fishmat <- matrix(c(mini[1], mini[2], pop[1] - mini[1], pop[2] -
## + mini[2]), ncol = 2, nrow = 2)
## + .... [TRUNCATED]
##
## > calc_hits <- function(nameset, mat) {
## + minimat <- select(mat, one_of(nameset))[rownames(mat) %in%
## + nameset, ]
## + minimat.size <- ( .... [TRUNCATED]
##
## > calc_hits_slice <- function(nameset, mat) {
## + minimat <- slice(mat, nameset)[, nameset]
## + minimat.hits <- sum(minimat)/2
## + return(minima .... [TRUNCATED]
##
## > create_cooccur_mat <- function(mat) {
## + library(reshape2)
## + dat2 <- melt(mat)
## + w <- dcast(dat2, V2 ~ V1)
## + x <- as.matrix(w[, -1])
## .... [TRUNCATED]
##
## > genotypes_plot <- function(mat, tit) {
## + top_genos <- c("CC_30", "CC_8", "CC_45", "CC_398", "CC_133",
## + "CC_59", "CC_15", "CC_97", "CC_ ..." ... [TRUNCATED]
##
## > all_genotypes_plot <- function(mat, tit) {
## + cS <- colSums(mat)
## + barplot(cS, main = tit, las = 3, cex.names = 0.8, col = "gray")
## + }
##
## > run_bs_subj_adonis <- function(df, bs_vec, subj_vec) {
## + library(e1071)
## + library(vegan)
## + body_site_adonis <- adonis(df ~ bs_vec)
## + .... [TRUNCATED]
##
## > make_subtype_matrix <- function(df) {
## + library(dplyr)
## + mat <- select(df, matches("CC")) %>% as.matrix
## + assert_that(dim(mat)[2] == 32) .... [TRUNCATED]
##
## > plot_coverages <- function(combined.df, titl) {
## + check_staph_df(combined.df)
## + par(mar = c(12, 4, 4, 2), cex = 0.8)
## + with(combined.df, .... [TRUNCATED]
##
## > plot_adjusted_coverages <- function(combined.df, titl) {
## + check_staph_df(combined.df)
## + stcols <- grep("CC|MLST", colnames(combined.df))
## + .... [TRUNCATED]
##
## > plot_mecA <- function(combined.df, titl) {
## + check_staph_df(combined.df)
## + with(combined.df, plot(Staph_cov, mecA_cov, col = Body.site,
## + .... [TRUNCATED]
##
## > plot_diversity_vers_cov <- function(combined.df, titl) {
## + library(vegan)
## + check_staph_df(combined.df)
## + stcols <- grep("CC|MLST", coln .... [TRUNCATED]
##
## > check_staph_df <- function(df) {
## + library(assertthat)
## + assert_that(length(grep("Body.site", colnames(df))) == 1)
## + assert_that(length( .... [TRUNCATED]
##
## > subject_perm <- function(df, multiSubjects, hamming_mat) {
## + library(gdata)
## + check_staph_df(df)
## + sub1.hits = 0
## + sub1.cells = 0
## + .... [TRUNCATED]
##
## > by_factor_perm <- function(bs, df, hamming_mat) {
## + check_staph_df(df)
## + for (i in bs) {
## + bss_rows <- which(df$Body.site == i)
## + .... [TRUNCATED]
##
## > intra_body_FTS <- function(body1, body2, df, multiSubjects,
## + u) {
## + library(dplyr)
## + check_staph_df(df)
## + temp.an <- filter(df, Bo .... [TRUNCATED]
##
## > merge_CCs <- function(in_data, CC) {
## + new_col <- select(in_data, matches(CC)) %>% rowSums()
## + in_data <- select(in_data, -(matches(CC)))
## + .... [TRUNCATED]
##
## > plot_CC_types <- function(CC, CCcol = "red", mat,
## + SRA_file, map11, map10, plotdir) {
## + library(RgoogleMaps)
## + crows <- which(mat[[CC] .... [TRUNCATED]
dat4 <- read.table("./Data/cov0.025")
#list of all subjects with more than one sample
multiSubjects <- count(dat4,Subject.Id) %>% filter(n > 1) %>% select(Subject.Id )
dat5 <- make_subtype_matrix(dat4)
#create Hamming dist matrices with and without cutof min value of 0.2
dat4$Subject.Id <- as.factor(dat4$Subject.Id)
dat6 <- make_subtype_matrix(dat4) %>% bintr(0.2) %>% hamming.distance %>% data.frame
dat8 <- make_subtype_matrix(dat4) %>% hamming.distance %>% data.frame
test for significant associations of subtype with with bodysite and subject. us e Hamming dist. matrix. Two levels, one with a beta cutoff for all samples > 0.2 and one without
set.seed(344098)
run_bs_subj_adonis(dat6,dat4$Body.site,dat4$Subject.Id)
##
## Call:
## adonis(formula = df ~ bs_vec)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## bs_vec 13 1.0285 0.079116 3.1293 0.11701 0.001 ***
## Residuals 307 7.7617 0.025282 0.88299
## Total 320 8.7902 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Homogeneity of multivariate dispersions
##
## Call: betadisper(d = dist(df), group = bs_vec)
##
## No. of Positive Eigenvalues: 30
## No. of Negative Eigenvalues: 0
##
## Average distance to median:
## anterior nares attached keratinized gingiva
## 15.375 10.909
## buccal mucosa hard palate
## 13.933 0.000
## left retroauricular crease palatine tonsil
## 15.540 12.800
## posterior fornnix right antecubital fossa
## 9.508 0.000
## right retroauricular crease saliva
## 18.481 0.000
## stool subgingival_plaque
## 10.459 0.000
## supragingival plaque tongue dorsum
## 13.960 16.537
##
## Eigenvalues for PCoA axes:
## PCoA1 PCoA2 PCoA3 PCoA4 PCoA5 PCoA6 PCoA7
## 38142.284 32135.658 7543.185 3552.046 2344.080 1905.688 1345.445
## PCoA8
## 1006.212
## Df Sum Sq Mean Sq F N.Perm Pr(>F)
## Groups 13 2062.417 158.64744 5.149763 999 0.001
## Residuals 307 9457.671 30.80675 NA NA NA
##
## Call:
## adonis(formula = df ~ subj_vec)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## subj_vec 107 3.5035 0.032743 1.3192 0.39857 0.005 **
## Residuals 213 5.2867 0.024820 0.60143
## Total 320 8.7902 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Homogeneity of multivariate dispersions
##
## Call: betadisper(d = dist(df), group = subj_vec)
##
## No. of Positive Eigenvalues: 30
## No. of Negative Eigenvalues: 0
##
## Average distance to median:
## 1 2 3 4 5 6 7
## 8.958e+00 1.162e+01 0.000e+00 0.000e+00 0.000e+00 0.000e+00 9.849e+00
## 8 9 10 11 12 13 15
## 2.500e+00 9.092e+00 2.366e+01 1.183e+01 1.543e+01 1.312e+01 1.723e+01
## 16 17 18 19 20 21 22
## 0.000e+00 0.000e+00 9.501e+00 0.000e+00 0.000e+00 8.958e+00 1.176e+01
## 23 24 25 26 27 28 29
## 0.000e+00 0.000e+00 1.473e+01 6.819e+00 1.169e+01 1.308e+01 1.320e+01
## 30 31 32 33 34 35 36
## 9.811e+00 9.899e+00 1.572e+01 1.477e+01 1.183e+01 1.266e+01 0.000e+00
## 37 38 39 40 41 42 43
## 1.664e+01 4.796e+00 1.536e+01 1.261e+01 1.488e+01 1.066e+01 1.119e+01
## 44 45 46 47 48 49 50
## 6.532e+00 0.000e+00 0.000e+00 0.000e+00 1.364e+01 0.000e+00 0.000e+00
## 51 52 53 54 55 56 57
## 0.000e+00 1.107e-13 0.000e+00 0.000e+00 1.447e+01 1.877e+01 1.170e+01
## 58 60 61 62 63 64 65
## 0.000e+00 1.502e+01 0.000e+00 0.000e+00 1.260e+01 0.000e+00 1.364e+01
## 66 67 68 69 70 71 72
## 1.098e+01 4.372e+00 1.380e+01 1.406e+01 1.042e+01 8.007e+00 1.693e+01
## 73 74 75 76 77 78 79
## 1.625e+01 6.620e+00 5.972e+00 7.681e+00 1.201e+01 8.419e+00 3.847e-14
## 80 81 82 83 84 85 86
## 1.099e+01 1.083e+01 4.849e-14 8.957e-14 2.981e+00 0.000e+00 1.686e+01
## 87 88 89 90 91 92 93
## 1.364e+01 0.000e+00 0.000e+00 0.000e+00 1.136e+01 1.150e+01 1.242e+01
## 94 95 96 97 98 99 100
## 1.883e+01 1.572e+01 0.000e+00 1.334e+01 0.000e+00 0.000e+00 1.167e+01
## 101 102 103 104 105 106 107
## 0.000e+00 1.050e+01 1.183e+01 0.000e+00 0.000e+00 1.300e+01 9.695e+00
## 108 109 110
## 1.364e+01 0.000e+00 8.958e+00
##
## Eigenvalues for PCoA axes:
## PCoA1 PCoA2 PCoA3 PCoA4 PCoA5 PCoA6 PCoA7
## 38142.284 32135.658 7543.185 3552.046 2344.080 1905.688 1345.445
## PCoA8
## 1006.212
## Df Sum Sq Mean Sq F N.Perm Pr(>F)
## Groups 107 9890.30 92.43271 1.16733 999 0.206
## Residuals 213 16865.99 79.18303 NA NA NA
##
## Call:
## adonis(formula = df ~ bs_vec + subj_vec)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## bs_vec 13 1.0285 0.079116 3.4197 0.11701 0.001 ***
## subj_vec 106 3.1115 0.029354 1.2688 0.35397 0.009 **
## Residuals 201 4.6502 0.023135 0.52902
## Total 320 8.7902 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
run_bs_subj_adonis(dat8,dat4$Body.site,dat4$Subject.Id)
##
## Call:
## adonis(formula = df ~ bs_vec)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## bs_vec 13 2.7175 0.209036 13.274 0.35983 0.001 ***
## Residuals 307 4.8347 0.015748 0.64017
## Total 320 7.5522 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Homogeneity of multivariate dispersions
##
## Call: betadisper(d = dist(df), group = bs_vec)
##
## No. of Positive Eigenvalues: 273
## No. of Negative Eigenvalues: 0
##
## Average distance to median:
## anterior nares attached keratinized gingiva
## 58.70 13.46
## buccal mucosa hard palate
## 14.91 0.00
## left retroauricular crease palatine tonsil
## 35.90 19.00
## posterior fornnix right antecubital fossa
## 11.88 0.00
## right retroauricular crease saliva
## 30.99 0.00
## stool subgingival_plaque
## 19.91 0.00
## supragingival plaque tongue dorsum
## 15.43 24.37
##
## Eigenvalues for PCoA axes:
## PCoA1 PCoA2 PCoA3 PCoA4 PCoA5 PCoA6
## 857175.250 21855.143 12873.981 6676.322 4027.667 3102.656
## PCoA7 PCoA8
## 2676.566 2258.205
## Df Sum Sq Mean Sq F N.Perm Pr(>F)
## Groups 13 87726.96 6748.228 5.441166 999 0.021
## Residuals 307 380746.71 1240.217 NA NA NA
##
## Call:
## adonis(formula = df ~ subj_vec)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## subj_vec 107 2.2068 0.020624 0.82183 0.29221 0.876
## Residuals 213 5.3454 0.025096 0.70779
## Total 320 7.5522 1.00000
##
## Homogeneity of multivariate dispersions
##
## Call: betadisper(d = dist(df), group = subj_vec)
##
## No. of Positive Eigenvalues: 273
## No. of Negative Eigenvalues: 0
##
## Average distance to median:
## 1 2 3 4 5 6 7
## 1.539e+01 2.766e+01 0.000e+00 0.000e+00 0.000e+00 0.000e+00 5.955e+01
## 8 9 10 11 12 13 15
## 8.147e+01 2.462e+01 4.899e+01 2.177e+01 5.392e+01 5.926e+01 2.727e+01
## 16 17 18 19 20 21 22
## 0.000e+00 0.000e+00 4.641e+01 0.000e+00 0.000e+00 1.135e+01 1.012e+01
## 23 24 25 26 27 28 29
## 0.000e+00 0.000e+00 4.007e+01 1.441e+01 3.130e+01 2.821e+01 2.056e+01
## 30 31 32 33 34 35 36
## 2.745e+01 1.785e+02 3.343e+01 1.766e+01 9.517e+01 3.924e+01 0.000e+00
## 37 38 39 40 41 42 43
## 5.463e+01 3.283e+01 4.120e+01 2.579e+01 3.220e+01 5.490e+01 1.308e+02
## 44 45 46 47 48 49 50
## 7.313e+01 0.000e+00 0.000e+00 0.000e+00 8.775e+00 0.000e+00 0.000e+00
## 51 52 53 54 55 56 57
## 0.000e+00 4.492e+01 0.000e+00 0.000e+00 2.800e+01 3.023e+01 2.951e+01
## 58 60 61 62 63 64 65
## 0.000e+00 2.050e+01 0.000e+00 0.000e+00 2.982e+01 0.000e+00 1.674e+01
## 66 67 68 69 70 71 72
## 1.784e+01 3.924e+01 1.266e+01 1.435e+01 1.878e+01 1.836e+01 1.955e+01
## 73 74 75 76 77 78 79
## 2.133e+01 1.389e+01 1.529e+01 2.324e+01 1.605e+01 8.174e+00 7.483e+00
## 80 81 82 83 84 85 86
## 1.489e+01 1.519e+01 3.288e-12 3.983e-12 2.653e+01 0.000e+00 1.299e+01
## 87 88 89 90 91 92 93
## 8.775e+00 0.000e+00 0.000e+00 0.000e+00 1.010e+01 3.750e+01 1.101e+01
## 94 95 96 97 98 99 100
## 2.305e+01 1.624e+01 0.000e+00 6.227e+01 0.000e+00 0.000e+00 1.364e+01
## 101 102 103 104 105 106 107
## 0.000e+00 2.092e+01 1.540e+01 0.000e+00 0.000e+00 2.235e+01 1.064e+01
## 108 109 110
## 8.775e+00 0.000e+00 1.497e+01
##
## Eigenvalues for PCoA axes:
## PCoA1 PCoA2 PCoA3 PCoA4 PCoA5 PCoA6
## 857175.250 21855.143 12873.981 6676.322 4027.667 3102.656
## PCoA7 PCoA8
## 2676.566 2258.205
## Df Sum Sq Mean Sq F N.Perm Pr(>F)
## Groups 107 191149.5 1786.444 1.185151 999 0.271
## Residuals 213 321066.7 1507.356 NA NA NA
##
## Call:
## adonis(formula = df ~ bs_vec + subj_vec)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## bs_vec 13 2.7175 0.209036 13.457 0.35983 0.001 ***
## subj_vec 106 1.7125 0.016155 1.040 0.22675 0.368
## Residuals 201 3.1222 0.015533 0.41342
## Total 320 7.5522 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#test ffor whether intra-subject distance greater than intersubject
subject_perm(dat4,multiSubjects,dat6)
## Score for intraperson hits = 1388
## [1] "Quartlies for random distribution"
## 0% 25% 50% 75% 100%
## 1438 1521 1540 1558 1638
## Empirical p value = 0
#now look at the same test between body sites
bs <- levels(dat4$Body.site)
by_factor_perm(bs,dat4,dat6)
## [1] "anterior nares"
## [1] "Number of samples " "67"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 4024 4886 5066 5242 6086
## [1] 5112
## Empirical p value [1] 0.5675
##
## [1] "attached keratinized gingiva"
## [1] "Number of samples " "4"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 0 12 13 16 28
## [1] 8
## Empirical p value [1] 0.0512
##
## [1] "buccal mucosa"
## [1] "Number of samples " "54"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 2581 3148 3274 3406 3956
## [1] 2683
## Empirical p value [1] 0.001
##
## Zero samples in hard palate[1] "left retroauricular crease"
## [1] "Number of samples " "23"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 304 540 578 616 814
## [1] 636
## Empirical p value [1] 0.8506
##
## [1] "palatine tonsil"
## [1] "Number of samples " "6"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 5 29 34 39 61
## [1] 36
## Empirical p value [1] 0.617
##
## [1] "posterior fornnix"
## [1] "Number of samples " "9"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 16 74 82 92 136
## [1] 48
## Empirical p value [1] 0.0084
##
## Zero samples in right antecubital fossa[1] "right retroauricular crease"
## [1] "Number of samples " "28"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 535 816 865 915 1161
## [1] 1081
## Empirical p value [1] 0.9984
##
## Zero samples in saliva[1] "stool"
## [1] "Number of samples " "7"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 6 42 48 54 84
## [1] 30
## Empirical p value [1] 0.0347
##
## Zero samples in subgingival_plaque[1] "supragingival plaque"
## [1] "Number of samples " "37"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 1072 1450 1528 1604 1932
## [1] 1164
## Empirical p value [1] 0.001
##
## [1] "tongue dorsum"
## [1] "Number of samples " "82"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 6477.0 7376.0 7608.5 7842.0 8896.0
## [1] 7815
## Empirical p value [1] 0.726
presence_mat <- as.data.frame(bintr(dat5,0.2))
top_score_mat <- as.data.frame(bintr(dat5,0.5))
# png("~/Dropbox/ARTICLES_BY_TDR/2015-staph-metagenome/HMP_barchart.png",width=640, height =640, res = 75)
# dev.off()
genotypes_plot(presence_mat,"Top CCs, 0.025X cutff, subtypes present > 0.2")
genotypes_plot(top_score_mat,"Top CCs, 0.025X cutoff, subtypes present > 0.5")
all_genotypes_plot(presence_mat,"All CCs, 0.025X cutoff, subtypes present > 0.2")
all_genotypes_plot(top_score_mat,"All CCs, 0.025X cutoff, subtypes present > 0.5")
for (i in bs) {
bss_rows <- which(dat4$Body.site == i)
if(length(bss_rows) > 0) {
bs_df <- slice(presence_mat,bss_rows)
genotypes_plot(bs_df,paste("Present: ", i))
}
}
for (i in bs) {
bss_rows <- which(dat4$Body.site == i)
if(length(bss_rows) > 0) {
bs_df <- slice(top_score_mat,bss_rows)
genotypes_plot(bs_df,paste("Top score: ", i))
}
}
###PCA
par(mfrow=c(2,2))
pcobj <- prcomp(dat6)
tr_gray <- rgb(0.5,.5,.5,.15)
for (i in bs) {
prcols <- rep(tr_gray,nrow(dat6))
prcols[which(dat4$Body.site == i)] <- "red"
plot(pcobj$x,col = prcols, pch = 16, main = i)
}
for (i in multiSubjects$Subject.Id) {
sub_rows = which(dat4$Subject.Id == as.character(i))
if (length(sub_rows) > 3){
prcols <- rep(tr_gray,nrow(dat6))
prcols[sub_rows] <- "blue"
plot(pcobj$x,col = prcols, pch = 16, main = c("Subject",i))
}
}
###Session Info
sessionInfo()
## R version 3.2.1 (2015-06-18)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.5 (Yosemite)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] assertthat_0.1 vegan_2.3-0 lattice_0.20-33
## [4] permute_0.8-4 gdata_2.17.0 RColorBrewer_1.1-2
## [7] e1071_1.6-7 dplyr_0.4.2 reshape2_1.4.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.0 cluster_2.0.3 knitr_1.11 magrittr_1.5
## [5] MASS_7.3-44 R6_2.1.1 stringr_1.0.0 plyr_1.8.3
## [9] tools_3.2.1 parallel_3.2.1 grid_3.2.1 nlme_3.1-122
## [13] mgcv_1.8-7 DBI_0.3.1 htmltools_0.2.6 class_7.3-14
## [17] gtools_3.5.0 lazyeval_0.1.10 yaml_2.1.13 digest_0.6.8
## [21] Matrix_1.2-2 evaluate_0.7.2 rmarkdown_0.8 stringi_0.5-5